Machine learning project

Authors: Jose Pérez Cano & Álvaro Ribot Barrado

0. Libraries

install.packages("klaR")
install.packages("TunePareto")
install.packages("rgl")
install.packages("glmnet")
install.packages("ca")
# LDA/ QDA
library(MASS)

# RDA
library(klaR)

# Multinomial
library(nnet)

# Cross-Validation
library(TunePareto)

# Naive Bayes
library(e1071)

# k-NN
library(class)

# 3d
library(rgl)

# LASSO
library(Matrix)
library(glmnet)
Loading required package: foreach
Loaded glmnet 2.0-16
# Correspondence analysis
library(ca)

1. Read data

set.seed(2105)
setwd("../data")
The working directory was changed to /Users/joseperezcano/Desktop/CFIS segundo curso/AA1/Project/data inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
clev <- read.csv("cleveland.csv", header=F)

2. Preprocess data

We apply clustering and several plotting techniques to have an idea of the dataset. In case there are NAs we will use k-NN for imputation.

To extract new features we will apply PCA and FDA and keep these new features and components apart.

# Identifies columns that consist entirely of the missing-data code -9.
# The indices are returned negated so the caller can drop those columns
# with dd[, na.columns(dd)].  When no column is all-missing, the full
# positive index sequence is returned: the original `-rmlist` on an empty
# list produced integer(0), and dd[, integer(0)] silently drops EVERY column.
na.columns <- function(dd){
  rmlist <- c()
  for (i in seq_len(ncol(dd))){
    # A column is constant at -9 exactly when both its min and max are -9.
    if (min(dd[,i]) == max(dd[,i]) && min(dd[,i]) == -9){
      rmlist <- c(rmlist, i)
    }
  }
  if (length(rmlist) == 0) return(seq_len(ncol(dd)))
  -rmlist
}

clev <- clev[,na.columns(clev)]

# Returns (negated) indices of columns containing more than `threshold`
# occurrences of the missing-data code -9, for use as dd[, much.na.cols(dd, t)].
# When no column exceeds the threshold, the full positive index sequence is
# returned so subsetting keeps all columns (the original returned integer(0),
# and dd[, integer(0)] drops every column).  This is also compatible with the
# re-negated call pattern names(dd)[-much.na.cols(dd, 0)], which then selects
# no names, exactly as before.
much.na.cols <- function(dd, threshold){
  rmlist <- c()
  for (i in seq_len(ncol(dd))){
    if (sum(dd[,i] == -9) > threshold){
      rmlist <- c(rmlist, i)
    }
  }
  if (length(rmlist) == 0) return(seq_len(ncol(dd)))
  -rmlist
}

clev <- clev[, much.na.cols(clev, 60)]

# Applies k-nearest-neighbour imputation for a given variable.
#
# dd       - data frame supplying the predictor columns
# variable - the column of dd to impute (numeric vector containing NAs)
# varname  - name of that column; it is excluded from the predictors
# k        - number of neighbours passed to class::knn
#
# Returns `variable` with its NAs replaced by the predicted values.
knn.imputation = function (dd, variable, varname, k)
{  
  aux = subset (dd, select = names(dd)[names(dd) != varname])
  aux1 = aux[!is.na(variable),]
  aux2 = aux[is.na(variable),]

  # Neither of aux1, aux2 can contain NAs
  knn.inc = knn (aux1,aux2, variable[!is.na(variable)], k)
  # knn() returns a FACTOR of predicted classes.  Assigning it directly into
  # a numeric vector stores the factor's internal integer codes (1, 2, ...)
  # rather than the original values, so convert through the labels first.
  variable[is.na(variable)] = as.numeric(as.character(knn.inc))
  variable
}

# These are the variables whose values were substituted by dummy values.
dummy <- c("V1", "V2", "V36", "V69", "V70", "V71", "V72", "V73", "V28", "location")
clev <- clev[,!(names(clev) %in% dummy)]

# knn imputation for clev
na.names <- names(clev)[-much.na.cols(clev, 0)]
for (name in na.names){
  clev[, name][clev[, name] == -9] <- NA
  clev[, name] <- knn.imputation(clev, clev[,name], name, 7)
}

Now, we analyse the correlations among variables since it will have impact on later models.

corr.factors <- cor(clev)
which(abs(corr.factors)-diag(diag(corr.factors))>0.9, arr.ind=T)
    row col
V55  34  12
V57  36  14
V31  21  20
V29  20  21
V20  12  34
V22  14  36
rm.correlated <- c("V57", "V55")
clev <- clev[,!(names(clev) %in% rm.correlated)]

factores <- c("V58", "V4", "V9", "V16", "V18", "V19", "V20", "V21", "V22", "V23", "V24", "V25", "V26", "V27", "V38", "V39", "V41", "V51", "V56", "V11", "V59", "V60", "V61", "V63", "V65", "V67", "V68")
for (f in factores){
  clev[,f] <- as.factor(clev[,f])
}

# Dummy level gets replaced
clev$V25[clev$V25 == 2] <- 1
clev$V25 <- droplevels(clev$V25)
levels(clev$V25)
[1] "0" "1"

This is the dataset after the treatment for missings.

summary(clev)
       V3        V4      V9           V10        V11          V12       
 Min.   :29.00   0: 91   1: 22   Min.   : 94.0   0:108   Min.   :126.0  
 1st Qu.:48.00   1:191   2: 43   1st Qu.:120.0   1:174   1st Qu.:213.0  
 Median :55.00           3: 84   Median :130.0           Median :244.0  
 Mean   :54.41           4:133   Mean   :131.6           Mean   :249.1  
 3rd Qu.:61.00                   3rd Qu.:140.0           3rd Qu.:277.0  
 Max.   :77.00                   Max.   :200.0           Max.   :564.0  
                                                                        
      V14             V15        V16     V18     V19          V20    
 Min.   : 0.00   Min.   : 0.00   0:240   0:107   0:138   1      :38  
 1st Qu.: 0.00   1st Qu.: 0.00   1: 42   1:175   1:  2   12     :33  
 Median :10.00   Median :15.00                   2:142   2      :31  
 Mean   :16.64   Mean   :15.08                           8      :30  
 3rd Qu.:30.00   3rd Qu.:30.00                           6      :26  
 Max.   :99.00   Max.   :54.00                           11     :25  
                                                         (Other):99  
      V21      V22     V23     V24     V25     V26     V27          V29        
 4      : 13   81:67   0:271   0:186   0:211   0:252   0:248   Min.   : 1.800  
 15     : 13   82:96   1: 11   1: 95   1: 71   1: 30   1: 34   1st Qu.: 6.500  
 20     : 13   83:86           2:  1                           Median : 8.500  
 13     : 12   84:33                                           Mean   : 8.418  
 16     : 12                                                   3rd Qu.:10.075  
 21     : 12                                                   Max.   :15.000  
 (Other):207                                                                   
      V31              V32             V33              V34       
 Min.   : 3.000   Min.   : 71.0   Min.   : 40.00   Min.   : 84.0  
 1st Qu.: 7.000   1st Qu.:133.2   1st Qu.: 65.00   1st Qu.:154.0  
 Median : 9.000   Median :153.5   Median : 74.00   Median :168.0  
 Mean   : 9.754   Mean   :149.8   Mean   : 75.12   Mean   :168.1  
 3rd Qu.:12.000   3rd Qu.:165.8   3rd Qu.: 84.00   3rd Qu.:184.0  
 Max.   :18.000   Max.   :202.0   Max.   :119.00   Max.   :232.0  
                                                                  
      V35              V37         V38     V39          V40        V41    
 Min.   : 26.00   Min.   : 50.00   0:190   0:276   Min.   :0.000   1:135  
 1st Qu.: 70.00   1st Qu.: 80.00   1: 92   1:  6   1st Qu.:0.000   2:129  
 Median : 80.00   Median : 85.00                   Median :0.800   3: 18  
 Mean   : 78.74   Mean   : 84.95                   Mean   :1.027          
 3rd Qu.: 85.00   3rd Qu.: 90.00                   3rd Qu.:1.600          
 Max.   :120.00   Max.   :110.00                   Max.   :6.200          
                                                                          
      V43             V44         V51          V56      V58     V59     V60    
 Min.   : 24.0   Min.   :0.0000   1:  2   5      : 14   0:157   1:270   1:242  
 1st Qu.: 92.0   1st Qu.:0.0000   3:159   21     : 14   1: 50   2: 12   2: 40  
 Median :118.0   Median :0.0000   6: 14   14     : 12   2: 31                  
 Mean   :123.6   Mean   :0.6702   7:107   1      : 11   3: 32                  
 3rd Qu.:152.8   3rd Qu.:1.0000           17     : 11   4: 12                  
 Max.   :270.0   Max.   :3.0000           30     : 11                          
                                          (Other):209                          
 V61     V63     V65     V67     V68    
 1:224   1:238   1:236   1:233   1:246  
 2: 58   2: 44   2: 46   2: 49   2: 36  
                                        
                                        
                                        
                                        
                                        

2.1 Visualizations

par(mfrow = c(2,3))
for(i in 1:ncol(clev)){
  if (!is.factor(clev[,i])) {
    hist(clev[,i], main = names(clev)[i], xlab="Values")
  }
}

par(mfrow = c(2,3))
for(i in 1:ncol(clev)){
  if (!is.factor(clev[,i])) {
    boxplot(clev[,i], xlab = names(clev)[i])
  }
}

par(mfrow = c(3,3))
for(i in 1:ncol(clev)){
  if (is.factor(clev[,i])) {
    hist(as.numeric(as.character(clev[,i])), main = names(clev)[i], xlab="Values")
  }
}

names_num <- c()
for(i in 1:ncol(clev)){
  if (!is.factor(clev[,i])) {
    names_num <- c(names_num, i)
  }
}
clev_numeric <- clev[,names_num]

clev_cor <- cor(clev_numeric)
which(clev_cor > 0.5 & clev_cor < 1, arr.ind = TRUE)
    row col
V34  10   2
V37  12   2
V15   5   4
V14   4   5
V31   7   6
V29   6   7
V32   8   7
V31   7   8
V10   2  10
V37  12  11
V10   2  12
V35  11  12
which(-clev_cor > 0.5 & -clev_cor < 1, arr.ind = TRUE)
     row col

2.2 Modification of values

par(mfrow = c(2,3))
for(i in 1:length(clev_numeric)){
    qqnorm(clev_numeric[,i], main = c("Q-Q Plot: ", names(clev_numeric)[i]))
    qqline(clev_numeric[,i], col=2)
}

Boxcox

par(mfrow = c(2,3))
for(i in 1:length(clev_numeric)){
  boxcox(lm(clev_numeric[,i]-min(clev_numeric[,i])+1e-6~1),lambda = seq(-1, 1.5, by=0.1), xlab = c(names(clev_numeric))[i])
}

par(mfrow = c(1,3))
#we treat them as special cases
aux <- clev_numeric[,"V14"]
aux <- aux[aux !=0]
boxcox(lm(aux~1),lambda = seq(-2, 1.5, by=0.1))
aux <- clev_numeric[,"V15"]
aux <- aux[aux !=0]
boxcox(lm(aux~1),lambda = seq(-2, 1.5, by=0.1))
aux <- clev_numeric[,"V40"]
aux <- aux[aux !=0]
boxcox(lm(aux~1),lambda = seq(-2, 1.5, by=0.1))

clev_sqrt <- c("V10", "V12", "V31", "V43")
clev_sqrt_especial <- c("V14", "V40")
clev_box <- clev_numeric
# Box-Cox-motivated transformation: square root (lambda = 1/2) for the
# selected columns.  clev_sqrt columns are first shifted to be strictly
# positive; clev_sqrt_especial columns are already non-negative.
for (i in seq_len(ncol(clev_box))){
  if (names(clev_box)[i] %in% clev_sqrt) {
    clev_box[,i] <- 2*sqrt(clev_box[,i]-min(clev_box[,i])+1e-6)
  } else if (names(clev_box)[i] %in% clev_sqrt_especial){
    clev_box[,i] <- 2*sqrt(clev_box[,i])
  }
}
aux <- names(clev_box)
clev_box <- data.frame(clev_box) 
# Restore the column names using the replacement function.  The original
# `colnames <- aux` only created an unused variable shadowing base::colnames
# and never touched clev_box.
colnames(clev_box) <- aux
par(mfrow = c(2,3))
for(i in 1:ncol(clev_box)){
    qqnorm(clev_box[,i], main = c("Q-Q Plot: ", names(clev_box)[i]))
    qqline(clev_box[,i], col=2)
}

For further models, normalisation will be useful.

And this is the final dataset after processing it.

clev[, colnames(clev_box)] <- clev_box[,colnames(clev_box)]
summary(clev)
       V3           V4      V9           V10           V11    
 Min.   :-2.80693   0: 91   1: 22   Min.   :-3.90175   0:108  
 1st Qu.:-0.70820   1:191   2: 43   1st Qu.:-0.55123   1:174  
 Median : 0.06502           3: 84   Median : 0.04091          
 Mean   : 0.00000           4:133   Mean   : 0.00000          
 3rd Qu.: 0.72778                   3rd Qu.: 0.55507          
 Max.   : 2.49513                   Max.   : 2.86408          
                                                              
      V12                 V14               V15           V16     V18     V19    
 Min.   :-4.643087   Min.   :-1.0513   Min.   :-0.98383   0:240   0:107   0:138  
 1st Qu.:-0.650497   1st Qu.:-1.0513   1st Qu.:-0.98383   1: 42   1:175   1:  2  
 Median : 0.006803   Median : 0.0726   Median :-0.00532                   2:142  
 Mean   : 0.000000   Mean   : 0.0000   Mean   : 0.00000                          
 3rd Qu.: 0.617022   3rd Qu.: 0.8954   3rd Qu.: 0.97319                          
 Max.   : 4.315880   Max.   : 2.4850   Max.   : 2.53880                          
                                                                                 
      V20          V21      V22     V23     V24     V25     V26     V27    
 1      :38   4      : 13   81:67   0:271   0:186   0:211   0:252   0:248  
 12     :33   15     : 13   82:96   1: 11   1: 95   1: 71   1: 30   1: 34  
 2      :31   20     : 13   83:86           2:  1                          
 8      :30   13     : 12   84:33                                          
 6      :26   16     : 12                                                  
 11     :25   21     : 12                                                  
 (Other):99   (Other):207                                                  
      V29                V31               V32               V33          
 Min.   :-2.55482   Min.   :-4.1234   Min.   :-3.4360   Min.   :-2.54549  
 1st Qu.:-0.74055   1st Qu.:-0.8588   1st Qu.:-0.7205   1st Qu.:-0.73334  
 Median : 0.03148   Median :-0.1247   Median : 0.1629   Median :-0.08097  
 Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
 3rd Qu.: 0.63946   3rd Qu.: 0.7743   3rd Qu.: 0.6973   3rd Qu.: 0.64389  
 Max.   : 2.54059   Max.   : 2.2000   Max.   : 2.2786   Max.   : 3.18089  
                                                                          
      V34                 V35                V37            V38     V39    
 Min.   :-3.565358   Min.   :-3.95648   Min.   :-3.687203   0:190   0:276  
 1st Qu.:-0.596232   1st Qu.:-0.65596   1st Qu.:-0.521933   1: 92   1:  6  
 Median :-0.002407   Median : 0.09416   Median : 0.005612                  
 Mean   : 0.000000   Mean   : 0.00000   Mean   : 0.000000                  
 3rd Qu.: 0.676251   3rd Qu.: 0.46922   3rd Qu.: 0.533157                  
 Max.   : 2.712223   Max.   : 3.09464   Max.   : 2.643337                  
                                                                           
      V40          V41          V43                V44          V51    
 Min.   :-1.1986   1:135   Min.   :-3.73286   Min.   :-0.7158   1:  2  
 1st Qu.:-1.1986   2:129   1st Qu.:-0.53995   1st Qu.:-0.7158   3:159  
 Median : 0.1781   3: 18   Median : 0.02122   Median :-0.7158   6: 14  
 Mean   : 0.0000           Mean   : 0.00000   Mean   : 0.0000   7:107  
 3rd Qu.: 0.7484           3rd Qu.: 0.66073   3rd Qu.: 0.3522          
 Max.   : 2.6341           Max.   : 2.34044   Max.   : 2.4883          
                                                                       
      V56      V58     V59     V60     V61     V63     V65     V67     V68    
 5      : 14   0:157   1:270   1:242   1:224   1:238   1:236   1:233   1:246  
 21     : 14   1: 50   2: 12   2: 40   2: 58   2: 44   2: 46   2: 49   2: 36  
 14     : 12   2: 31                                                          
 1      : 11   3: 32                                                          
 17     : 11   4: 12                                                          
 30     : 11                                                                  
 (Other):209                                                                  

2.3. Feature extraction

Separate the data into training and test sets; set a seed for reproducibility.

set.seed(2000)
n <- nrow(clev)
train.lenght <- round(2*n/3)

clev <- clev[sample(n),]
train <- clev[1:train.lenght,]
test <- clev[(train.lenght+1):n,]
names_num <- c()
for(i in 1:ncol(train)){
  if (!is.factor(train[,i])) {
    names_num <- c(names_num, i)
  }
}
train_num <- train[,names_num]

Extract PCA features.

pca <- princomp(train_num)
screeplot(pca)

summary(pca)
Importance of components:
                          Comp.1    Comp.2    Comp.3     Comp.4
Standard deviation     1.8444732 1.5577035 1.3875402 1.21213461
Proportion of Variance 0.2297381 0.1638543 0.1300108 0.09921788
Cumulative Proportion  0.2297381 0.3935924 0.5236032 0.62282104
                           Comp.5     Comp.6     Comp.7    Comp.8
Standard deviation     1.01035405 0.97964763 0.85803949 0.7938423
Proportion of Variance 0.06893431 0.06480791 0.04971676 0.0425556
Cumulative Proportion  0.69175535 0.75656326 0.80628002 0.8488356
                           Comp.9    Comp.10    Comp.11
Standard deviation     0.73133601 0.69242679 0.62465621
Proportion of Variance 0.03611787 0.03237695 0.02634938
Cumulative Proportion  0.88495350 0.91733045 0.94367983
                          Comp.12    Comp.13    Comp.14
Standard deviation     0.57654420 0.50756673 0.43609672
Proportion of Variance 0.02244675 0.01739701 0.01284263
Cumulative Proportion  0.96612658 0.98352359 0.99636622
                           Comp.15
Standard deviation     0.231971913
Proportion of Variance 0.003633784
Cumulative Proportion  1.000000000

Fp <- pca$scores
Gs <- pca$loadings

Fs <- Fp %*% diag(1/pca$sdev)
Gp <- Gs %*% diag(pca$sdev) * 2

col.class <- as.numeric(train$V58)
col.class[col.class==1] <- "red"
col.class[col.class==2] <- "green"
col.class[col.class==3] <- "blue"
col.class[col.class==4] <- "yellow"
col.class[col.class==5] <- "purple"

plot(Fs[,1], Fs[,2], asp=1, col = col.class, xlab = "First principal component", ylab = "Second principal component")
arrows(rep(0,dim(Gs)[1]),rep(0,dim(Gs)[1]), Gp[,1], Gp[,2])
text(Gp[,1], Gp[,2], names(train_num), col = "black")
legend("bottomright", fill=c("red","green", "blue", "yellow", "purple"), legend=c('0','1','2','3', '4'))

V1, V59 and V57 create many problems

problematic <- c("V57", "V59")
train <- train[, !(names(train) %in% problematic)]
fda <- lda(V58~V3+V4+V9+V10+V11+V12+V14+V15+V16+V18+V19+V23+V24+V25+V26+V27+V29+V31+V32+V33+V34+V35+V37+V38+V39+V40+V41+V43+V44+V51+V56+V60+V61+V63+V65+V67+V68, data=train)
#plot(fda)
loadings <- predict(fda)$x
plot(loadings, col = col.class)
legend("bottomright", fill=c("red","green", "blue", "yellow", "purple"), legend=c('0','1','2','3', '4'))

train$LD1 <- loadings[,1]
train$LD2 <- loadings[,2]
train$LD3 <- loadings[,3]
train$LD4 <- loadings[,4]

fda_test <- predict(fda, newdata = test)
test$LD1 <- fda_test$x[,1]
test$LD2 <- fda_test$x[,2]
test$LD3 <- fda_test$x[,3]
test$LD4 <- fda_test$x[,4]

Correspondence analysis

ac <- mjca(clev[,names(clev) %in% factores], lambda="Burt")
plot(ac, main="MCA biplot of Burt matrix with data")


ac_ind <- mjca(train[,names(train) %in% factores], lambda="indicator", reti = T)
plot(ac_ind$rowcoord, col = col.class)
legend("bottomright", fill=c("red","green", "blue", "yellow", "purple"), legend=c('0','1','2','3', '4'))


mca.features <- ac_ind$rowcoord

3. Resampling protocol

Principal utility function for doing cross-validation.

Create the CV function for any model with associated predict and update functions.

# Generic repeated, stratified K-fold cross-validation.
#
# data       - full data frame containing all model variables
# target     - response vector aligned with the rows of `data`; used for
#              stratifying the folds and scoring predictions
# model      - a fitted model object, re-fitted on each training fold via
#              update(model, data = tr)
# times      - number of independent CV repetitions
# nfolds     - number of folds per repetition
# need.class - TRUE when predict() returns a list whose $class element holds
#              the labels (e.g. MASS::lda, klaR::rda); FALSE when predict()
#              returns the labels directly (e.g. nnet::multinom)
#
# Returns the mean validation error rate over all repetitions.
cross.validation <- function(data, target, model, times, nfolds, need.class){
  set.seed(0202)  # fixed seed so the fold assignment is reproducible
  CV.folds <- generateCVRuns(target, ntimes=times, nfold=nfolds, stratified=TRUE)

  # Preallocate instead of growing with c() inside the loops.
  err.total <- numeric(times)
  for (i in seq_len(times)){
    err.onetime <- numeric(nfolds)
    for (j in seq_len(nfolds)){
      print(paste0("Fold: ", j))
      val <- unlist(CV.folds[[i]][[j]])

      tr <- data[-val,]
      va <- data[val,]

      model <- update(model, data=tr)
      pred <- predict(model, newdata=va)
      if (need.class){
        pred <- pred$class
      }

      # Error = 1 - accuracy, computed from the confusion table.
      err.table <- table(True=target[val], Predicted=pred)
      err.onetime[j] <- 1-sum(diag(err.table))/sum(err.table)
    }
    err.total[i] <- mean(err.onetime)
    print(paste0("Iteration ", i, ", mean error: ", mean(err.onetime)))
  }
  mean(err.total)
}

Cross-validation for k-Nearest Neighbour

# Repeated, stratified K-fold cross-validation for k-nearest neighbours.
#
# data   - data frame of predictors (rows aligned with `target`)
# target - response vector; supplies the k-NN class labels and stratifies folds
# times  - number of independent CV repetitions
# nfolds - number of folds per repetition
# K      - number of neighbours passed to class::knn
#
# Returns the mean validation error rate over all repetitions.
#
# NOTE(review): `data` is handed to knn() unchanged, so if it still contains
# the target column the neighbour search can see the labels — confirm that
# callers drop the response column before use.
cross.validation.knn <- function(data, target, times, nfolds, K){
  set.seed(0202)  # fixed seed so the fold assignment is reproducible
  CV.folds <- generateCVRuns(target, ntimes=times, nfold=nfolds, stratified=TRUE)

  # Preallocate instead of growing with c() inside the loops.
  err.total <- numeric(times)
  for (i in seq_len(times)){
    err.onetime <- numeric(nfolds)
    for (j in seq_len(nfolds)){
      val <- unlist(CV.folds[[i]][[j]])

      tr <- data[-val,]
      va <- data[val,]

      pred <- knn(tr, va, target[-val], k = K)

      err.table <- table(True=target[val], Predicted=pred)
      err.onetime[j] <- 1-sum(diag(err.table))/sum(err.table)
    }
    err.total[i] <- mean(err.onetime)
  }
  mean(err.total)
}

Cross-validation Naive-Bayes

# Repeated, stratified K-fold cross-validation for a Naive Bayes classifier.
#
# data   - data frame containing V58 and the predictor columns
# target - response vector aligned with the rows of `data`
# model  - accepted for interface symmetry with cross.validation() but NEVER
#          used: a fresh naiveBayes model with the hard-coded formula
#          V58 ~ . - LD1 - LD2 - LD3 - LD4 is fitted on every fold
# times  - number of independent CV repetitions
# nfolds - number of folds per repetition
#
# Returns the mean validation error rate over all repetitions.
cross.validation.naive <- function(data, target, model, times, nfolds){
  set.seed(0202)  # fixed seed so the fold assignment is reproducible
  CV.folds <- generateCVRuns(target, ntimes=times, nfold=nfolds, stratified=TRUE)

  # Preallocate instead of growing with c() inside the loops.
  err.total <- numeric(times)
  for (i in seq_len(times)){
    err.onetime <- numeric(nfolds)
    for (j in seq_len(nfolds)){
      print(paste0("Fold: ", j))
      val <- unlist(CV.folds[[i]][[j]])

      tr <- data[-val,]
      va <- data[val,]

      # Refit on the training fold; the FDA-derived columns LD1..LD4 are
      # excluded from the predictors.
      model <- naiveBayes(V58~.-LD1-LD2-LD3-LD4, data=tr)
      pred <- predict(model, newdata=va)

      err.table <- table(True=target[val], Predicted=pred)
      err.onetime[j] <- 1-sum(diag(err.table))/sum(err.table)
    }
    err.total[i] <- mean(err.onetime)
    print(paste0("Iteration ", i, ", mean error: ", mean(err.onetime)))
  }
  mean(err.total)
}

4. Models

The models we are going to use are: - LDA - QDA - RDA - k-NN - Naïve Bayes - GLM

rda.model <- rda(V58~V3+V4+V9+V10+V11+V12+V14+V15+V16+V18+V19+V20+V21+V22+V23+V24+V25+V26+V27+V29+V31+V32+V33+V34+V35+V37+V38+V39+V40+V41+V43+V44+V51+V56+V60+V61+V63+V65+V67+V68, data=train)
naive.model <- naiveBayes(V58~V3+V4+V9+V10+V11+V12+V14+V15+V16+V18+V19+V20+V21+V22+V23+V24+V25+V26+V27+V29+V31+V32+V33+V34+V35+V37+V38+V39+V40+V41+V43+V44+V51+V56+V60+V61+V63+V65+V67+V68, data=train)
cross.validation(train, train$V58, rda.model, 10, 10, T)
rda.model.fda <- rda(V58~.,data=train)
cross.validation(train, train$V58, rda.model.fda, 10, 10, T)
cross.validation.naive(train, train$V58, naive.model, 10, 10)
err <- c()
for (k in 1:20){
  err <- c(err, cross.validation.knn(train, train$V58, 10,10, k))
}
plot(err, type = "l")

err
 [1] 0.1519883 0.1965205 0.1950585 0.2004678 0.2073977 0.2152924
 [7] 0.2184795 0.2276023 0.2313450 0.2323392 0.2419883 0.2456433
[13] 0.2642398 0.2690351 0.2743275 0.2791228 0.2871053 0.2955556
[19] 0.3003801 0.3050877
cross.validation.knn(train, train$V58, 10, 10, 1)

multinomial.model <- multinom(V58~., data=train)

cross.validation(train, train$V58, multinomial.model, 10, 10, F)

multinomial.model.step <- step(multinomial.model)

cross.validation(train, train$V58, multinomial.model.step, 10, 10, F)

multinomial.model.noFDA <- multinom(V58~.-LD1-LD2-LD3-LD4, data=train)

cross.validation(train, train$V58, multinomial.model.noFDA, 10, 10, F)

multinomial.model.noFDA.step <- step(multinomial.model.noFDA)

cross.validation(train, train$V58, multinomial.model.noFDA.step, 10, 10, F)

Test error

rda.model <- update(rda.model.fda, data=train)
pred.test <- predict(rda.model.fda, test)
pred.test <- pred.test$class
(err.table <- table(True=test$V58, Pred=pred.test))
    Pred
True  0  1  2  3  4
   0 40  5  0  0  0
   1  6 13  1  0  2
   2  0  2  6  4  1
   3  0  0  1  9  0
   4  0  0  0  1  3
(err.test <- 1-sum(diag(err.table))/sum(err.table))
[1] 0.2446809

Part 2

We will now use the model on a dataset not yet investigated: Hungary.

hung <- read.table("../data/processed.hungarian.data", header=F, sep=',', na.strings="?")
summary(hung)
       V1              V2               V3              V4       
 Min.   :28.00   Min.   :0.0000   Min.   :1.000   Min.   : 92.0  
 1st Qu.:42.00   1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:120.0  
 Median :49.00   Median :1.0000   Median :3.000   Median :130.0  
 Mean   :47.83   Mean   :0.7245   Mean   :2.983   Mean   :132.6  
 3rd Qu.:54.00   3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:140.0  
 Max.   :66.00   Max.   :1.0000   Max.   :4.000   Max.   :200.0  
                                                  NA's   :1      
       V5              V6                V7        
 Min.   : 85.0   Min.   :0.00000   Min.   :0.0000  
 1st Qu.:209.0   1st Qu.:0.00000   1st Qu.:0.0000  
 Median :243.0   Median :0.00000   Median :0.0000  
 Mean   :250.8   Mean   :0.06993   Mean   :0.2184  
 3rd Qu.:282.5   3rd Qu.:0.00000   3rd Qu.:0.0000  
 Max.   :603.0   Max.   :1.00000   Max.   :2.0000  
 NA's   :23      NA's   :8         NA's   :1       
       V8              V9              V10        
 Min.   : 82.0   Min.   :0.0000   Min.   :0.0000  
 1st Qu.:122.0   1st Qu.:0.0000   1st Qu.:0.0000  
 Median :140.0   Median :0.0000   Median :0.0000  
 Mean   :139.1   Mean   :0.3038   Mean   :0.5861  
 3rd Qu.:155.0   3rd Qu.:1.0000   3rd Qu.:1.0000  
 Max.   :190.0   Max.   :1.0000   Max.   :5.0000  
 NA's   :1       NA's   :1                        
      V11             V12           V13             V14        
 Min.   :1.000   Min.   :0     Min.   :3.000   Min.   :0.0000  
 1st Qu.:2.000   1st Qu.:0     1st Qu.:5.250   1st Qu.:0.0000  
 Median :2.000   Median :0     Median :6.000   Median :0.0000  
 Mean   :1.894   Mean   :0     Mean   :5.643   Mean   :0.3605  
 3rd Qu.:2.000   3rd Qu.:0     3rd Qu.:7.000   3rd Qu.:1.0000  
 Max.   :3.000   Max.   :0     Max.   :7.000   Max.   :1.0000  
 NA's   :190     NA's   :291   NA's   :266                     

Missing values treatment

rm.var <- c("V11", "V12", "V13")
hung <- hung[,!(names(hung) %in% rm.var)]
hung[is.na(hung)] <- -9
var.imputable <- c("V4", "V5", "V6", "V7", "V8", "V9")
for (nom in var.imputable){
  hung[, nom][hung[, nom] == -9] <- NA
  variable <- hung[, nom]
  hung[, nom] <- knn.imputation(hung, variable, nom, 7)
}
var.factor <- c("V2", "V3", "V6", "V7", "V9", "V14")
for (nom in var.factor){
  hung[, nom] <- as.factor(as.character(hung[,nom]))
}
summary(hung)
       V1        V2      V3            V4              V5       
 Min.   :28.00   0: 81   1: 11   Min.   : 27.0   Min.   :  4.0  
 1st Qu.:42.00   1:213   2:106   1st Qu.:120.0   1st Qu.:198.0  
 Median :49.00           3: 54   Median :130.0   Median :237.0  
 Mean   :47.83           4:123   Mean   :132.2   Mean   :236.6  
 3rd Qu.:54.00                   3rd Qu.:140.0   3rd Qu.:277.0  
 Max.   :66.00                   Max.   :200.0   Max.   :603.0  
 V6      V7            V8        V9           V10         V14    
 0:266   0:235   Min.   : 54.0   0:204   Min.   :0.0000   0:188  
 1: 28   1: 53   1st Qu.:122.0   1: 89   1st Qu.:0.0000   1:106  
         2:  6   Median :140.0   2:  1   Median :0.0000          
                 Mean   :138.8           Mean   :0.5861          
                 3rd Qu.:155.0           3rd Qu.:1.0000          
                 Max.   :190.0           Max.   :5.0000          

Gaussianity treatment

names_num <- c()
for(i in 1:ncol(hung)){
  if (!is.factor(hung[,i])) {
    names_num <- c(names_num, i)
  }
}
hung_numeric <- hung[,names_num]

hung_cor <- cor(hung_numeric)
which(hung_cor > 0.5 & hung_cor < 1, arr.ind = TRUE)
     row col
which(-hung_cor > 0.5 & -hung_cor < 1, arr.ind = TRUE)
     row col
par(mfrow = c(2,3))
for(i in 1:length(hung_numeric)){
    qqnorm(hung_numeric[,i], main = c("Q-Q Plot: ", names(hung_numeric)[i]))
    qqline(hung_numeric[,i], col=2)
}

Boxcox

par(mfrow = c(2,3))
for(i in 1:length(hung_numeric)){
  boxcox(lm(hung_numeric[,i]-min(hung_numeric[,i])+1e-6~1),lambda = seq(-1, 1.5, by=0.1), xlab = c(names(hung_numeric))[i])
}

We see that variable V10 requires a logarithmic transformation.

hung[,"V10"] <- log(1+hung[,"V10"])
qqnorm(hung[,"V10"])
qqline(hung[,"V10"], col=2)

Feature extraction

Separate the data into training and test sets; set a seed for reproducibility.

set.seed(2000)
n <- nrow(hung)
train.lenght <- round(2*n/3)

hung <- hung[sample(n),]
train <- hung[1:train.lenght,]
test <- hung[(train.lenght+1):n,]
names_num <- c()
for(i in 1:ncol(train)){
  if (!is.factor(train[,i])) {
    names_num <- c(names_num, i)
  }
}
train_num <- train[,names_num]

Extract PCA features.

pca <- princomp(train_num)
screeplot(pca)

summary(pca)
Importance of components:
                           Comp.1      Comp.2      Comp.3
Standard deviation     86.0161788 24.47912423 17.80405417
Proportion of Variance  0.8849996  0.07167612  0.03791583
Cumulative Proportion   0.8849996  0.95667568  0.99459151
                            Comp.4       Comp.5
Standard deviation     6.709559731 0.4448637716
Proportion of Variance 0.005384815 0.0000236721
Cumulative Proportion  0.999976328 1.0000000000

Fp <- pca$scores
Gs <- pca$loadings

Fs <- Fp %*% diag(1/pca$sdev)
Gp <- Gs %*% diag(pca$sdev) *0.3

col.class <- as.numeric(train$V14)
col.class[col.class==0] <- "red"
col.class[col.class==1] <- "blue"

plot(Fs[,1], Fs[,2], asp=1, col = col.class, xlab = "First principal component", ylab = "Second principal component")
arrows(rep(0,dim(Gs)[1]),rep(0,dim(Gs)[1]), Gp[,1], Gp[,2])
text(Gp[,1], Gp[,2], names(train_num), col = "black")
legend("bottomright", fill=c("red","blue"), legend=c('0','1'))

fda <- lda(V14~., data=train)
#plot(fda)
loadings <- predict(fda)$x
plot(loadings, col = col.class)
legend("bottomright", fill=c("red","blue"), legend=c('0','1'))

train$LD1 <- loadings[,1]

fda_test <- predict(fda, newdata = test)
test$LD1 <- fda_test$x[,1]

Correspondence analysis

ac <- mjca(hung[,names(hung) %in% var.factor], lambda="Burt")
plot(ac, main="MCA biplot of Burt matrix with data")


ac_ind <- mjca(train[,names(train) %in% var.factor], lambda="indicator", reti = T)
plot(ac_ind$rowcoord, col = col.class)
legend("bottomright", fill=c("red","blue"), legend=c('0','1'))


mca.features <- ac_ind$rowcoord

---
title: "R Notebook"
output: html_notebook
---


# Machine learning project
## Authors: Jose Pérez Cano & Álvaro Ribot Barrado

### 0. Libraries

```{r}
install.packages("klaR")
install.packages("TunePareto")
install.packages("rgl")
install.packages("glmnet")
install.packages("ca")
```

```{r}
# LDA/ QDA
library(MASS)

# RDA
library(klaR)

# Multinomial
library(nnet)

# Cross-Validation
library(TunePareto)

# Naive Bayes
library(e1071)

# k-NN
library(class)

# 3d
library(rgl)

# LASSO
library(Matrix)
library(glmnet)

# Correspondence analysis
library(ca)
```


### 1. Read data

```{r}
set.seed(2105)
setwd("../data")
clev <- read.csv("cleveland.csv", header=F)
hung <- read.csv("hungarian.csv", header=F)
va <- read.csv("long-beach-va.csv", header=F)
switz <- read.csv("switzerland.csv", header=F)

clev$location <- "cleveland"
hung$location <- "hungarian"
va$location <- "long-beach-va"
switz$location <- "switzerland"

heart1 <- rbind(clev, hung)
heart2 <- rbind(va, switz)
heart <- rbind(heart1, heart2)
head(heart)
```


### 2. Preprocess data
We apply clustering and several plotting techniques to have an idea of the dataset. In case there are NAs we will use k-NN for imputation. 

To extract new features we will apply PCA and FDA and keep these new features and components apart.

```{r}
# Identifies columns that consist entirely of the missing-data code -9.
# The indices are returned negated so the caller can drop those columns
# with dd[, na.columns(dd)].  When no column is all-missing, the full
# positive index sequence is returned: the original `-rmlist` on an empty
# list produced integer(0), and dd[, integer(0)] silently drops EVERY column.
na.columns <- function(dd){
  rmlist <- c()
  for (i in seq_len(ncol(dd))){
    # A column is constant at -9 exactly when both its min and max are -9.
    if (min(dd[,i]) == max(dd[,i]) && min(dd[,i]) == -9){
      rmlist <- c(rmlist, i)
    }
  }
  if (length(rmlist) == 0) return(seq_len(ncol(dd)))
  -rmlist
}

clev <- clev[,na.columns(clev)]

# Returns (negated) indices of columns containing more than `threshold`
# occurrences of the missing-data code -9, for use as dd[, much.na.cols(dd, t)].
# When no column exceeds the threshold, the full positive index sequence is
# returned so subsetting keeps all columns (the original returned integer(0),
# and dd[, integer(0)] drops every column).  This is also compatible with the
# re-negated call pattern names(dd)[-much.na.cols(dd, 0)], which then selects
# no names, exactly as before.
much.na.cols <- function(dd, threshold){
  rmlist <- c()
  for (i in seq_len(ncol(dd))){
    if (sum(dd[,i] == -9) > threshold){
      rmlist <- c(rmlist, i)
    }
  }
  if (length(rmlist) == 0) return(seq_len(ncol(dd)))
  -rmlist
}

clev <- clev[, much.na.cols(clev, 60)]

# Applies k-nearest-neighbour imputation for a given variable.
#
# dd       - data frame supplying the predictor columns
# variable - the column of dd to impute (numeric vector containing NAs)
# varname  - name of that column; it is excluded from the predictors
# k        - number of neighbours passed to class::knn
#
# Returns `variable` with its NAs replaced by the predicted values.
knn.imputation = function (dd, variable, varname, k)
{  
  aux = subset (dd, select = names(dd)[names(dd) != varname])
  aux1 = aux[!is.na(variable),]
  aux2 = aux[is.na(variable),]

  # Neither of aux1, aux2 can contain NAs
  knn.inc = knn (aux1,aux2, variable[!is.na(variable)], k)
  # knn() returns a FACTOR of predicted classes.  Assigning it directly into
  # a numeric vector stores the factor's internal integer codes (1, 2, ...)
  # rather than the original values, so convert through the labels first.
  variable[is.na(variable)] = as.numeric(as.character(knn.inc))
  variable
}

# These are the variables whose values were substituted by dummy values.
dummy <- c("V1", "V2", "V36", "V69", "V70", "V71", "V72", "V73", "V28", "location")
clev <- clev[,!(names(clev) %in% dummy)]

# knn imputation for clev
na.names <- names(clev)[-much.na.cols(clev, 0)]
for (name in na.names){
  clev[, name][clev[, name] == -9] <- NA
  clev[, name] <- knn.imputation(clev, clev[,name], name, 7)
}
```

Now, we analyse the correlations among variables since it will have impact on later models.

```{r}
corr.factors <- cor(clev)
which(abs(corr.factors)-diag(diag(corr.factors))>0.9, arr.ind=T)

rm.correlated <- c("V57", "V55")
clev <- clev[,!(names(clev) %in% rm.correlated)]

factores <- c("V58", "V4", "V9", "V16", "V18", "V19", "V20", "V21", "V22", "V23", "V24", "V25", "V26", "V27", "V38", "V39", "V41", "V51", "V56", "V11", "V59", "V60", "V61", "V63", "V65", "V67", "V68")
for (f in factores){
  clev[,f] <- as.factor(clev[,f])
}

# Dummy level gets replaced
clev$V25[clev$V25 == 2] <- 1
clev$V25 <- droplevels(clev$V25)
levels(clev$V25)
```

This is the dataset after the treatment for missings.

```{r}
summary(clev)
```


#### 2.1 Visualizations

```{r}
par(mfrow = c(2,3))
for(i in 1:ncol(clev)){
  if (!is.factor(clev[,i])) {
    hist(clev[,i], main = names(clev)[i], xlab="Values")
  }
}
```


```{r}
par(mfrow = c(2,3))
for(i in 1:ncol(clev)){
  if (!is.factor(clev[,i])) {
    boxplot(clev[,i], xlab = names(clev)[i])
  }
}
```


```{r}
par(mfrow = c(3,3))
for(i in 1:ncol(clev)){
  if (is.factor(clev[,i])) {
    hist(as.numeric(as.character(clev[,i])), main = names(clev)[i], xlab="Values")
  }
}
```


```{r}
names_num <- c()
for(i in 1:ncol(clev)){
  if (!is.factor(clev[,i])) {
    names_num <- c(names_num, i)
  }
}
clev_numeric <- clev[,names_num]

clev_cor <- cor(clev_numeric)
which(clev_cor > 0.5 & clev_cor < 1, arr.ind = TRUE)
which(-clev_cor > 0.5 & -clev_cor < 1, arr.ind = TRUE)
```


#### 2.2 Modification of values

```{r}
par(mfrow = c(2,3))
for(i in 1:length(clev_numeric)){
    qqnorm(clev_numeric[,i], main = c("Q-Q Plot: ", names(clev_numeric)[i]))
    qqline(clev_numeric[,i], col=2)
}
```


Boxcox

```{r}
par(mfrow = c(2,3))
for(i in 1:length(clev_numeric)){
  boxcox(lm(clev_numeric[,i]-min(clev_numeric[,i])+1e-6~1),lambda = seq(-1, 1.5, by=0.1), xlab = c(names(clev_numeric))[i])
}
```

```{r}
# V14, V15 and V40 contain exact zeros, which break the Box-Cox likelihood,
# so we profile them on their non-zero values only (treated as special cases).
# The three copy-pasted stanzas of the original are collapsed into one loop.
par(mfrow = c(1,3))
for (v in c("V14", "V15", "V40")) {
  vals <- clev_numeric[, v]
  vals <- vals[vals != 0]
  boxcox(lm(vals ~ 1), lambda = seq(-2, 1.5, by=0.1))
}
```


```{r}
# Apply the square-root (Box-Cox lambda = 1/2) transform chosen above.
clev_sqrt <- c("V10", "V12", "V31", "V43")   # need shifting to be positive
clev_sqrt_especial <- c("V14", "V40")        # already non-negative
clev_box <- clev_numeric
for (i in seq_len(ncol(clev_box))) {
  nom <- names(clev_box)[i]
  if (nom %in% clev_sqrt) {
    clev_box[,i] <- 2*sqrt(clev_box[,i]-min(clev_box[,i])+1e-6)
  } else if (nom %in% clev_sqrt_especial) {
    clev_box[,i] <- 2*sqrt(clev_box[,i])
  }
}
# FIX: the original did `colnames <- aux`, which silently shadowed
# base::colnames() with a character vector instead of restoring column names
# (data.frame() already preserves them); the dead assignment is removed.
clev_box <- data.frame(clev_box)
```


```{r}
par(mfrow = c(2,3))
for(i in 1:ncol(clev_box)){
    qqnorm(clev_box[,i], main = c("Q-Q Plot: ", names(clev_box)[i]))
    qqline(clev_box[,i], col=2)
}
```

For further models, normalisation will be useful.

```{r}
clev_box <- apply(clev_box, 2, scale)
```


And this is the final dataset after processing it.

```{r}
# Write the transformed, standardized columns back into the main data frame.
clev[, colnames(clev_box)] <- clev_box[,colnames(clev_box)]
summary(clev)
```


#### 2.3. Feature extraction

Separate the data into training and test sets; a seed is set for reproducibility.

```{r}
# Shuffle the rows, then keep the first two thirds for training and the rest
# as the held-out test set.
set.seed(2000)
n <- nrow(clev)
train.size <- round(2 * n / 3)

clev <- clev[sample(n), ]
train <- clev[seq_len(train.size), ]
test <- clev[(train.size + 1):n, ]
```

```{r}
# Numeric (non-factor) training columns, used below for PCA.
numeric.cols <- !vapply(train, is.factor, logical(1))
train_num <- train[, numeric.cols]
```

Extract PCA features.

```{r}
# PCA on the numeric training variables (princomp works on the covariance
# matrix; columns were standardized above, so this behaves like correlations).
pca <- princomp(train_num)
screeplot(pca)
summary(pca)
```


```{r}
biplot(pca)
```

```{r}
# Hand-made biplot: standardized row scores coloured by the target class V58,
# with variable arrows rescaled (x2) for readability.
Fp <- pca$scores     # principal coordinates of the observations
Gs <- pca$loadings   # standard coordinates of the variables

Fs <- Fp %*% diag(1/pca$sdev)
Gp <- Gs %*% diag(pca$sdev) * 2

# Map factor levels "0".."4" (numeric codes 1..5) to the legend colours.
palette.class <- c("red", "green", "blue", "yellow", "purple")
col.class <- palette.class[as.numeric(train$V58)]

plot(Fs[,1], Fs[,2], asp = 1, col = col.class,
     xlab = "First principal component",
     ylab = "Second principal component")
origin <- rep(0, nrow(Gs))
arrows(origin, origin, Gp[,1], Gp[,2])
text(Gp[,1], Gp[,2], names(train_num), col = "black")
legend("bottomright", fill = palette.class, legend = c('0','1','2','3', '4'))
```


Variables V1, V59 and V57 create many problems.

```{r}
# Remove the problematic variables before the discriminant fit.
# NOTE(review): V57 was already dropped with the correlated pair above, so
# removing it again is a harmless no-op; only V59 actually disappears here.
problematic <- c("V57", "V59")
train <- train[, !(names(train) %in% problematic)]
# Fisher/linear discriminant analysis on the remaining predictors.
# NOTE(review): V20, V21 and V22 are absent from this formula although they
# appear in the RDA fit below — confirm this omission is intentional.
fda <- lda(V58~V3+V4+V9+V10+V11+V12+V14+V15+V16+V18+V19+V23+V24+V25+V26+V27+V29+V31+V32+V33+V34+V35+V37+V38+V39+V40+V41+V43+V44+V51+V56+V60+V61+V63+V65+V67+V68, data=train)
#plot(fda)
# Project the training observations onto the discriminant axes and plot the
# first two, coloured by class.
loadings <- predict(fda)$x
plot(loadings, col = col.class)
legend("bottomright", fill=c("red","green", "blue", "yellow", "purple"), legend=c('0','1','2','3', '4'))
```


```{r}
# Append the four discriminant coordinates as extra features on both splits.
for (k in 1:4) {
  train[[paste0("LD", k)]] <- loadings[, k]
}

fda_test <- predict(fda, newdata = test)
for (k in 1:4) {
  test[[paste0("LD", k)]] <- fda_test$x[, k]
}
```


Correspondence analysis

```{r}
# Multiple correspondence analysis of the categorical variables:
# Burt-matrix version on the full dataset for the biplot, indicator-matrix
# version on the training split to get one row coordinate per observation.
ac <- mjca(clev[,names(clev) %in% factores], lambda="Burt")
plot(ac, main="MCA biplot of Burt matrix with data")

ac_ind <- mjca(train[,names(train) %in% factores], lambda="indicator", reti = T)
plot(ac_ind$rowcoord, col = col.class)
legend("bottomright", fill=c("red","green", "blue", "yellow", "purple"), legend=c('0','1','2','3', '4'))

# Row coordinates kept aside as candidate features.
mca.features <- ac_ind$rowcoord
```


### 3. Resampling protocol
Principal utility function for doing cross-validation. 

Create the CV function for any model with associated predict and update functions.

```{r}
# Generic repeated, stratified k-fold cross-validation for any model that
# supports update() and predict().
#
# Arguments:
#   data       data frame holding predictors (and the target column)
#   target     vector with the true class of every row of `data`
#   model      a fitted model; it is re-fit on each training fold via update()
#   times      number of independent CV repetitions
#   nfolds     number of folds per repetition
#   need.class TRUE when predict() returns a list whose $class holds the
#              labels (MASS/klaR discriminant models), FALSE when predict()
#              returns the labels directly (e.g. multinom)
#
# Returns the mean validation error rate over all repetitions.
cross.validation <- function(data, target, model, times, nfolds, need.class){
  set.seed(0202)
  CV.folds <- generateCVRuns(target, ntimes=times, nfold=nfolds, stratified=TRUE)

  err.total <- numeric(times)      # preallocated instead of growing with c()
  for (i in seq_len(times)){
    err.onetime <- numeric(nfolds)
    for (j in seq_len(nfolds)){
      print(paste0("Fold: ", j))
      val <- unlist(CV.folds[[i]][[j]])

      tr <- data[-val,]
      va <- data[val,]

      # Re-fit the supplied model on the training fold only.
      model <- update(model, data=tr)
      pred <- predict(model, newdata=va)
      if (need.class){
        pred <- pred$class
      }

      # Confusion matrix on the validation fold; error = 1 - accuracy.
      err.table <- table(True=target[val], Predicted=pred)
      err.onetime[j] <- 1-sum(diag(err.table))/sum(err.table)
    }
    err.total[i] <- mean(err.onetime)
    print(paste0("Iteration ", i, ", mean error: ", mean(err.onetime)))
  }
  mean(err.total)
}
```


Cross-validation for k-Nearest Neighbour

```{r}
# Repeated, stratified k-fold cross-validation for k-nearest neighbours
# (class::knn), which has no separate fit step: the training fold itself is
# the model. K is the number of neighbours.
#
# Returns the mean validation error rate over all repetitions.
cross.validation.knn <- function(data, target, times, nfolds, K){
  set.seed(0202)
  CV.folds <- generateCVRuns(target, ntimes=times, nfold=nfolds, stratified=TRUE)

  err.total <- numeric(times)      # preallocated instead of growing with c()
  for (i in seq_len(times)){
    err.onetime <- numeric(nfolds)
    for (j in seq_len(nfolds)){
      val <- unlist(CV.folds[[i]][[j]])

      tr <- data[-val,]
      va <- data[val,]

      pred <- knn(tr, va, target[-val], k = K)

      # Confusion matrix on the validation fold; error = 1 - accuracy.
      err.table <- table(True=target[val], Predicted=pred)
      err.onetime[j] <- 1-sum(diag(err.table))/sum(err.table)
    }
    err.total[i] <- mean(err.onetime)
  }
  mean(err.total)
}
```


Cross-validation Naive-Bayes

```{r}
# Repeated, stratified k-fold cross-validation for the Naive Bayes classifier.
#
# NOTE(review): the `model` argument is accepted for interface symmetry with
# cross.validation() but is ignored — a fresh naiveBayes with the hard-coded
# formula (all predictors except the FDA features LD1-LD4) is fit on every
# fold, since e1071 models cannot be re-fit via update().
#
# Returns the mean validation error rate over all repetitions.
cross.validation.naive <- function(data, target, model, times, nfolds){
  set.seed(0202)
  CV.folds <- generateCVRuns(target, ntimes=times, nfold=nfolds, stratified=TRUE)

  err.total <- numeric(times)      # preallocated instead of growing with c()
  for (i in seq_len(times)){
    err.onetime <- numeric(nfolds)
    for (j in seq_len(nfolds)){
      print(paste0("Fold: ", j))
      val <- unlist(CV.folds[[i]][[j]])

      tr <- data[-val,]
      va <- data[val,]

      # Fit on the training fold, excluding the FDA-derived features.
      model <- naiveBayes(V58~.-LD1-LD2-LD3-LD4, data=tr)
      pred <- predict(model, newdata=va)

      # Confusion matrix on the validation fold; error = 1 - accuracy.
      err.table <- table(True=target[val], Predicted=pred)
      err.onetime[j] <- 1-sum(diag(err.table))/sum(err.table)
    }
    err.total[i] <- mean(err.onetime)
    print(paste0("Iteration ", i, ", mean error: ", mean(err.onetime)))
  }
  mean(err.total)
}
```


### 4. Models
The models we are going to use are: 
  - LDA
  - QDA
  - RDA
  - k-NN
  - Naïve Bayes
  - GLM
 

```{r}
# Baseline RDA and Naive Bayes models fit on the raw predictors only
# (no PCA/FDA/MCA-derived features).
rda.model <- rda(V58~V3+V4+V9+V10+V11+V12+V14+V15+V16+V18+V19+V20+V21+V22+V23+V24+V25+V26+V27+V29+V31+V32+V33+V34+V35+V37+V38+V39+V40+V41+V43+V44+V51+V56+V60+V61+V63+V65+V67+V68, data=train)
naive.model <- naiveBayes(V58~V3+V4+V9+V10+V11+V12+V14+V15+V16+V18+V19+V20+V21+V22+V23+V24+V25+V26+V27+V29+V31+V32+V33+V34+V35+V37+V38+V39+V40+V41+V43+V44+V51+V56+V60+V61+V63+V65+V67+V68, data=train)
```


```{r}
cross.validation(train, train$V58, rda.model, 10, 10, T)
```


```{r}
rda.model.fda <- rda(V58~.,data=train)
```


```{r}
cross.validation(train, train$V58, rda.model.fda, 10, 10, T)
```


```{r}
cross.validation.naive(train, train$V58, naive.model, 10, 10)
```


```{r}
# CV error of k-NN for k = 1..20; the error vector is preallocated instead of
# being grown with c() on every iteration.
err <- numeric(20)
for (k in 1:20){
  err[k] <- cross.validation.knn(train, train$V58, 10, 10, k)
}
```


```{r}
# Error curve as a function of k; used to pick the k with lowest CV error.
plot(err, type = "l")
err
```


```{r}
# Best k-NN (k = 1) and several multinomial logistic models, with and without
# the FDA features, plus stepwise (AIC) variable selection for each.
# FALSE is spelled out instead of the reassignable shorthand F; multinom's
# predict() returns the class labels directly, hence need.class = FALSE.
cross.validation.knn(train, train$V58, 10, 10, 1)

multinomial.model <- multinom(V58~., data=train)

cross.validation(train, train$V58, multinomial.model, 10, 10, FALSE)

multinomial.model.step <- step(multinomial.model)

cross.validation(train, train$V58, multinomial.model.step, 10, 10, FALSE)

multinomial.model.noFDA <- multinom(V58~.-LD1-LD2-LD3-LD4, data=train)

cross.validation(train, train$V58, multinomial.model.noFDA, 10, 10, FALSE)

multinomial.model.noFDA.step <- step(multinomial.model.noFDA)

cross.validation(train, train$V58, multinomial.model.noFDA.step, 10, 10, FALSE)
```


## Test error

```{r}
# Final generalization error of the chosen model on the held-out test set.
# FIX: the original stored the refit in rda.model but then predicted with
# rda.model.fda; the refreshed model is now used consistently (both are fit
# on `train`, so the numbers are unchanged).
rda.model <- update(rda.model.fda, data=train)
pred.test <- predict(rda.model, test)$class
(err.table <- table(True=test$V58, Pred=pred.test))
(err.test <- 1-sum(diag(err.table))/sum(err.table))
```

# Part 2

We will now apply the same methodology to a dataset not yet investigated: Hungary.

```{r}
# Hungarian heart-disease data; '?' marks missing values in the raw file.
hung <- read.table("../data/processed.hungarian.data", header=F, sep=',', na.strings="?")
summary(hung)
```

## Missing values treatment

```{r}
# Remove V11-V13 as part of the missing-value treatment.
rm.var <- c("V11", "V12", "V13")
hung <- hung[, setdiff(names(hung), rm.var)]
```

```{r}
# Recode every NA with the -9 sentinel used throughout this project, then,
# for the columns we actually impute, turn -9 back into NA and fill the gaps
# with 7-nearest-neighbour imputation. NOTE(review): columns outside
# var.imputable keep the -9 sentinel where values were missing — confirm
# this is intended.
hung[is.na(hung)] <- -9
var.imputable <- c("V4", "V5", "V6", "V7", "V8", "V9")
for (nom in var.imputable){
  hung[, nom][hung[, nom] == -9] <- NA
  variable <- hung[, nom]
  hung[, nom] <- knn.imputation(hung, variable, nom, 7)
}
```

```{r}
# Declare the categorical columns as factors.
var.factor <- c("V2", "V3", "V6", "V7", "V9", "V14")
for (nom in var.factor){
  hung[[nom]] <- as.factor(as.character(hung[[nom]]))
}

# V7: level 2 holds very few observations — merge it into level 1.
hung$V7[hung$V7 == 2] <- 1
hung$V7 <- droplevels(hung$V7)
levels(hung$V7)

# V9: level 2 is a single anomalous value — merge it into level 1 as well.
hung$V9[hung$V9 == 2] <- 1
hung$V9 <- droplevels(hung$V9)
levels(hung$V9)
```


```{r}
summary(hung)
```

## Gaussianity treatment

```{r}
# Numeric (non-factor) columns of the Hungarian data and their correlations.
numeric.cols <- !vapply(hung, is.factor, logical(1))
hung_numeric <- hung[, numeric.cols]

hung_cor <- cor(hung_numeric)
which(hung_cor > 0.5 & hung_cor < 1, arr.ind = TRUE)    # strong positive
which(-hung_cor > 0.5 & -hung_cor < 1, arr.ind = TRUE)  # strong negative
```


```{r}
# Normal Q-Q plots for each continuous variable.
# FIX: paste() replaces the original main = c("Q-Q Plot: ", name), which
# handed title() a length-2 vector instead of one string.
par(mfrow = c(2,3))
for (i in seq_along(hung_numeric)) {
    qqnorm(hung_numeric[,i], main = paste("Q-Q Plot:", names(hung_numeric)[i]))
    qqline(hung_numeric[,i], col=2)
}
```


Boxcox

```{r}
# Box-Cox lambda profiles (columns shifted to be strictly positive).
par(mfrow = c(2,3))
for (i in seq_along(hung_numeric)) {
  shifted <- hung_numeric[,i] - min(hung_numeric[,i]) + 1e-6
  boxcox(lm(shifted ~ 1), lambda = seq(-1, 1.5, by=0.1),
         xlab = names(hung_numeric)[i])
}
```

We see that variable V10 requires a logarithmic transformation.

```{r}
hung[,"V10"] <- log(1+hung[,"V10"])
``` 

```{r}
# Check the normality of V10 after the log transform.
qqnorm(hung[,"V10"])
qqline(hung[,"V10"], col=2)
```

## Feature extraction

Separate the data into training and test sets; a seed is set for reproducibility.

```{r}
# Shuffle the rows, then keep the first two thirds for training and the rest
# as the held-out test set.
set.seed(2000)
n <- nrow(hung)
train.size <- round(2 * n / 3)

hung <- hung[sample(n), ]
train <- hung[seq_len(train.size), ]
test <- hung[(train.size + 1):n, ]
```

```{r}
# Numeric (non-factor) training columns, used below for PCA.
numeric.cols <- !vapply(train, is.factor, logical(1))
train_num <- train[, numeric.cols]
```

Extract PCA features.

```{r}
# PCA on the numeric training variables of the Hungarian data.
pca <- princomp(train_num)
screeplot(pca)
summary(pca)
```


```{r}
biplot(pca)
```

```{r}
Fp <- pca$scores
Gs <- pca$loadings

Fs <- Fp %*% diag(1/pca$sdev)
Gp <- Gs %*% diag(pca$sdev) *0.3

col.class <- as.numeric(train$V14)
col.class[col.class==0] <- "red"
col.class[col.class==1] <- "blue"

plot(Fs[,1], Fs[,2], asp=1, col = col.class, xlab = "First principal component", ylab = "Second principal component")
arrows(rep(0,dim(Gs)[1]),rep(0,dim(Gs)[1]), Gp[,1], Gp[,2])
text(Gp[,1], Gp[,2], names(train_num), col = "black")
legend("bottomright", fill=c("red","blue"), legend=c('0','1'))
```

```{r}
# Fisher discriminant analysis on all predictors; with a binary target there
# is a single discriminant direction (LD1).
fda <- lda(V14~., data=train)
#plot(fda)
# Project the training observations onto LD1 and plot them by class.
loadings <- predict(fda)$x
plot(loadings, col = col.class)
legend("bottomright", fill=c("red","blue"), legend=c('0','1'))
```


```{r}
# Add the single discriminant coordinate as an extra feature on both splits.
train$LD1 <- loadings[, 1]
test$LD1 <- predict(fda, newdata = test)$x[, 1]
```


Correspondence analysis

```{r}
ac <- mjca(hung[,names(hung) %in% var.factor], lambda="Burt")
plot(ac, main="MCA biplot of Burt matrix with data")

ac_ind <- mjca(train[,names(train) %in% var.factor], lambda="indicator", reti = T)
plot(ac_ind$rowcoord, col = col.class)
legend("bottomright", fill=c("red","blue"), legend=c('0','1'))

mca.features <- ac_ind$rowcoord
```

## 
